Data visualisation


Registration number: 19BCE1717
Faculty: Prof. Parvathi R
Slot: L55 + L56
Course code: CSE3020


Instructions:

Explore the ggplot2 package in R using a dataset of your choice and the midwest dataset used in class. Run the code given in class.

Sections:


PART 1: Survey dataset

Preprocessing and data cleaning

Import required package and dataset:
#Packages
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.2
library(MASS)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.2
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Dataset from the MASS package -> survey
# I chose this because visualising the survey based on many attributes provides more options and flexibility to explore ggplot2 package. 
data("survey")
data <- survey
head(data)
##      Sex Wr.Hnd NW.Hnd W.Hnd    Fold Pulse    Clap Exer Smoke Height      M.I
## 1 Female   18.5   18.0 Right  R on L    92    Left Some Never 173.00   Metric
## 2   Male   19.5   20.5  Left  R on L   104    Left None Regul 177.80 Imperial
## 3   Male   18.0   13.3 Right  L on R    87 Neither None Occas     NA     <NA>
## 4   Male   18.8   18.9 Right  R on L    NA Neither None Never 160.00   Metric
## 5   Male   20.0   20.0 Right Neither    35   Right Some Never 165.00   Metric
## 6 Female   18.0   17.7 Right  L on R    64   Right Some Never 172.72 Imperial
##      Age
## 1 18.250
## 2 17.583
## 3 16.917
## 4 20.333
## 5 23.667
## 6 21.000
Exploring the dataset

Structure:

str(data)
## 'data.frame':    237 obs. of  12 variables:
##  $ Sex   : Factor w/ 2 levels "Female","Male": 1 2 2 2 2 1 2 1 2 2 ...
##  $ Wr.Hnd: num  18.5 19.5 18 18.8 20 18 17.7 17 20 18.5 ...
##  $ NW.Hnd: num  18 20.5 13.3 18.9 20 17.7 17.7 17.3 19.5 18.5 ...
##  $ W.Hnd : Factor w/ 2 levels "Left","Right": 2 1 2 2 2 2 2 2 2 2 ...
##  $ Fold  : Factor w/ 3 levels "L on R","Neither",..: 3 3 1 3 2 1 1 3 3 3 ...
##  $ Pulse : int  92 104 87 NA 35 64 83 74 72 90 ...
##  $ Clap  : Factor w/ 3 levels "Left","Neither",..: 1 1 2 2 3 3 3 3 3 3 ...
##  $ Exer  : Factor w/ 3 levels "Freq","None",..: 3 2 2 2 3 3 1 1 3 3 ...
##  $ Smoke : Factor w/ 4 levels "Heavy","Never",..: 2 4 3 2 2 2 2 2 2 2 ...
##  $ Height: num  173 178 NA 160 165 ...
##  $ M.I   : Factor w/ 2 levels "Imperial","Metric": 2 1 NA 2 2 1 1 2 2 2 ...
##  $ Age   : num  18.2 17.6 16.9 20.3 23.7 ...

Dimension:

dim(data)
## [1] 237  12

Columns or attributes:

colnames(data)
##  [1] "Sex"    "Wr.Hnd" "NW.Hnd" "W.Hnd"  "Fold"   "Pulse"  "Clap"   "Exer"  
##  [9] "Smoke"  "Height" "M.I"    "Age"
Data cleaning
print(paste("Number of missing values = ", sum(is.na(data))))
## [1] "Number of missing values =  107"

NOTE: There are many missing values that will be filled up using data imputation techniques of mean, median and mode.

Missing data imputation
print(paste("Mean value of the column Wr.Hnd = ", mean(data$Wr.Hnd,na.rm = TRUE)))
## [1] "Mean value of the column Wr.Hnd =  18.6690677966102"
print(paste("Median value of Wr.Hnd = ", median(data$Wr.Hnd,na.rm = TRUE)))
## [1] "Median value of Wr.Hnd =  18.5"
print(paste("Maximum value of Wr.Hnd = ", max(data$Wr.Hnd,na.rm = TRUE)))
## [1] "Maximum value of Wr.Hnd =  23.2"
print(paste("Minimum value of Wr.Hnd = ", min(data$Wr.Hnd,na.rm = TRUE)))
## [1] "Minimum value of Wr.Hnd =  13"

The mean and median values are very close, while the min and max values differ widely in magnitude. Hence, substituting the mean or median for an NA value in this column is good practice. After repeating this for the other columns, it was found that mean/median imputation is the best choice for them too (when the datatype is int or numeric). I chose the mode for categorical attributes.

Replace numerical missing values:

# Impute every numeric column in one mutate() call:
#   - Wr.Hnd, NW.Hnd and Pulse take their column median,
#   - Height and Age take their column mean.
# NAs are ignored when computing the replacement value (na.rm = TRUE).
data <- mutate(
  data,
  Wr.Hnd = replace(Wr.Hnd, is.na(Wr.Hnd), median(Wr.Hnd, na.rm = TRUE)),
  NW.Hnd = replace(NW.Hnd, is.na(NW.Hnd), median(NW.Hnd, na.rm = TRUE)),
  Pulse = replace(Pulse, is.na(Pulse), median(Pulse, na.rm = TRUE)),
  Height = replace(Height, is.na(Height), mean(Height, na.rm = TRUE)),
  Age = replace(Age, is.na(Age), mean(Age, na.rm = TRUE))
)

Replace categorical missing values:

# Return the mode (most frequent non-missing value) of a vector or factor.
#
# x: an atomic vector or factor; may contain NA.
# Returns a length-1 value of the same type as x. Ties are resolved in favour
# of the value that appears first in x. If x has no non-missing values, a
# single NA of x's type is returned.
get_mode <- function(x) {
  # Drop NAs first: otherwise NA itself can win the count and the
  # "imputed" value would be NA again.
  observed <- x[!is.na(x)]
  if (length(observed) == 0) {
    return(x[NA_integer_])  # typed, length-1 NA
  }
  distinct_values <- unique(observed)
  distinct_tabulate <- tabulate(match(observed, distinct_values))
  distinct_values[which.max(distinct_tabulate)]
}
# Fill the NAs of each categorical column with that column's mode
# (as returned by get_mode()), all in a single mutate() call.
data <- mutate(
  data,
  M.I = if_else(is.na(M.I), get_mode(M.I), M.I),
  W.Hnd = if_else(is.na(W.Hnd), get_mode(W.Hnd), W.Hnd),
  Clap = if_else(is.na(Clap), get_mode(Clap), Clap),
  Smoke = if_else(is.na(Smoke), get_mode(Smoke), Smoke),
  Exer = if_else(is.na(Exer), get_mode(Exer), Exer),
  Sex = if_else(is.na(Sex), get_mode(Sex), Sex)
)
Cleaned data ready for visualisation:
print(paste("Number of missing values = ", sum(is.na(data))))
## [1] "Number of missing values =  0"

Visualisation using ggplot2

Scatterplot + encircle
library(ggalt)
## Warning: package 'ggalt' was built under R version 4.0.2
## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2
# Rows to highlight: 18 < Age <= 20 and 150 < Height < 180
data_select <- data[data$Age > 18 &
                      data$Age <= 20 &
                      data$Height > 150 &
                      data$Height < 180, ]
# Height vs Age scatterplot, coloured by Sex and sized by Pulse, with a loess
# smoother and a red outline around the highlighted rows
gg1 <- ggplot(data, aes(x = Height, y = Age)) +
  geom_point(aes(col = Sex, size = Pulse)) +   # draw points
  geom_smooth(method = "loess", se = FALSE) +  # smoothing line, no CI band
  geom_encircle(aes(x = Height, y = Age),
                data = data_select,
                color = "red",
                size = 2,
                expand = 0.08) +   # encircle
  labs(subtitle = "Height Vs Age",
       y = "Age",
       x = "Height",
       title = "Scatterplot + Encircle",
       caption = "Source: survey data")
plot(gg1)
## `geom_smooth()` using formula 'y ~ x'

Geom_smooth line
# Scatterplot of Wr.Hnd (x) against NW.Hnd (y) with a linear fit.
# aes() is positional: first argument is x, second is y, so the axis labels
# below must follow that order.
g <- ggplot(data, aes(Wr.Hnd, NW.Hnd))
g + geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(subtitle = "Wr.Hnd vs NW.Hnd",
       x = "Wr.Hnd",
       y = "NW.Hnd",
       title = "Scatterplot with overlapping points",
       caption = "Source: survey data")
## `geom_smooth()` using formula 'y ~ x'

Jittered points
# Jittered scatterplot of Wr.Hnd (x) against NW.Hnd (y); the horizontal jitter
# separates points that would otherwise overlap.
g <- ggplot(data, aes(Wr.Hnd, NW.Hnd))
g + geom_jitter(width = .5, size = 1) +
  labs(subtitle = "Survey data: Wr.Hnd vs NW.Hnd",
       x = "Wr.Hnd",   # labels follow the aes() mapping: x = Wr.Hnd, y = NW.Hnd
       y = "NW.Hnd",
       title = "Jittered Points")

Scatterplot (counts plot)
# Counts plot: point size encodes how many rows share the same
# (Wr.Hnd, NW.Hnd) pair
theme_set(theme_bw())  # pre-set the bw theme.
g <- ggplot(data, aes(Wr.Hnd, NW.Hnd))
g + geom_count(col = "tomato3", show.legend = FALSE) +
  labs(subtitle = "Survey data: Wr.Hnd vs NW.Hnd",
       x = "Wr.Hnd",   # labels follow the aes() mapping: x = Wr.Hnd, y = NW.Hnd
       y = "NW.Hnd",
       title = "Counts Plot")

Bubble Plot
# Bubble plot: Height vs Age for occasional and regular smokers only,
# coloured by Smoke level and sized by Wr.Hnd
data_select <- data[data$Smoke %in% c("Occas", "Regul"), ]
g <- ggplot(data_select, aes(Height, Age)) +
  labs(subtitle = "Survey data: Height vs Age",
       title = "Bubble chart")

# One linear fit per Smoke level, without confidence bands
g + geom_jitter(aes(col = Smoke, size = Wr.Hnd)) +
  geom_smooth(aes(col = Smoke), method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

Marginal Histogram / Boxplot
# Marginal Histogram / Boxplot
qplot(data$Pulse, geom="histogram") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=data, aes(Pulse)) + 
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of Pulse with one-unit bins; I() passes the colours/alpha as
# literal values instead of mapping them to a scale
qplot(data$Pulse,
      geom = "histogram",
      binwidth = 1,
      main = "Histogram for Pulse",
      xlab = "Pulse",  # the plotted variable is Pulse, not Age
      fill = I("blue"),
      col = I("red"),
      alpha = I(.4))

# Histogram of Pulse with bars coloured by their count.
# Breaks are derived from the observed Pulse range (rounded out to multiples
# of 5) so that no observation falls outside the bins.
ggplot(data = data, aes(Pulse)) +
  geom_histogram(breaks = seq(floor(min(data$Pulse) / 5) * 5,
                              ceiling(max(data$Pulse) / 5) * 5,
                              by = 5),
                 col = "red",
                 aes(fill = after_stat(count))) +
  scale_fill_gradient("Count", low = "green", high = "red") +
  labs(title = "Histogram for Pulse", x = "Pulse", y = "Count")

# Density-scaled histogram of Pulse with a kernel density curve on top.
# Breaks are derived from the observed Pulse range (rounded out to multiples
# of 5) so that no observation falls outside the bins.
ggplot(data = data, aes(Pulse)) +
  geom_histogram(aes(y = after_stat(density)),
                 breaks = seq(floor(min(data$Pulse) / 5) * 5,
                              ceiling(max(data$Pulse) / 5) * 5,
                              by = 5),
                 col = "red",
                 fill = "green",
                 alpha = .2) +
  geom_density(col = 2)

Stacked Bar
#Stacked Bar
g <- ggplot(data, aes(Age)) + scale_fill_brewer(palette = "Spectral")

g + geom_histogram(aes(fill=Smoke), 
                   binwidth = .1, 
                   col="black", 
                   size=.1) +  # change binwidth
  labs(title="Histogram with Auto Binning", 
       subtitle="Smokers across Age")  

g + geom_histogram(aes(fill=Smoke), 
                   bins=5, 
                   col="black", 
                   size=.1) +   # change number of bins
  labs(title="Histogram with Fixed Bins", 
       subtitle="Smokers across Age") 

g <- ggplot(data, aes(Clap))
g + geom_bar(aes(fill=Smoke), width = 0.5) + 
  theme(axis.text.x = element_text(angle=65, vjust=0.6)) + 
  labs(title="Histogram on Categorical Variable", 
       subtitle="Smokers across various type of clappers") 

Density plot
# Density plot
g <- ggplot(data, aes(Pulse))
g + geom_density(aes(fill=factor(Sex)), alpha=0.8) + 
  labs(title="Density plot", 
       subtitle="Pulse grouped by Sex",
       caption="Source: survey data",
       x="Pulse",
       fill="Sex")

Box Plot
# Box plot of Pulse for each Smoke level; varwidth scales each box's width
# with the number of observations in that group
g <- ggplot(data, aes(Smoke, Pulse))
g + geom_boxplot(varwidth = TRUE, fill = "plum") +
  labs(title = "Box plot",
       subtitle = "Pulse grouped by Smokers",
       caption = "Source: survey data",
       x = "Smokers",
       y = "Pulse")

Correlation
# correlation plot
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.0.2
# Correlation matrix
corr <- round(cor(data[,c(2,3,6,10,12)]), 1)
corr
##        Wr.Hnd NW.Hnd Pulse Height  Age
## Wr.Hnd    1.0    0.9   0.0    0.6  0.0
## NW.Hnd    0.9    1.0   0.0    0.6  0.1
## Pulse     0.0    0.0   1.0   -0.1 -0.1
## Height    0.6    0.6  -0.1    1.0  0.0
## Age       0.0    0.1  -0.1    0.0  1.0
Plot
# Plot
ggcorrplot(corr)

ggcorrplot(corr, method = "circle")
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

ggcorrplot(corr, hc.order = TRUE, outline.col = "white")

# Types of correlogram layout
# --------------------------------
# Get the lower triangle
ggcorrplot(corr, hc.order = TRUE, type = "lower",
           outline.col = "white")

# Get the upper triangle
ggcorrplot(corr, hc.order = TRUE, type = "upper",
           outline.col = "white")

# Change colors and theme
# --------------------------------
# Argument colors
ggcorrplot(corr, hc.order = TRUE, type = "lower",
           outline.col = "white",
           ggtheme = ggplot2::theme_gray,
           colors = c("#6D9EC1", "white", "#E46726"))

# Lower-triangle correlogram with circles and printed coefficients.
# corr was computed from the survey data columns, so the title names that
# dataset.
ggcorrplot(corr, hc.order = TRUE,
           type = "lower",
           lab = TRUE,
           lab_size = 3,
           method = "circle",
           colors = c("tomato2", "white", "springgreen3"),
           title = "Correlogram of survey",
           ggtheme = theme_bw)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

Diverging Barcharts
# Diverging bars: data preparation
# Work on the first 50 rows of the cleaned survey data.
data1 <- data[1:50, ]
# NOTE: this overwrites the Sex column with the row names so that every row
# becomes its own category on the axis; the plots below map x = Sex.
data1$Sex <- rownames(data1)
# Standardised height (z-score within these 50 rows), rounded to 2 decimals
data1$Height_z <- round((data1$Height - mean(data1$Height)) / sd(data1$Height), 2)
data1$Height_type <- ifelse(data1$Height_z < 0, "below", "above")  # above / below avg flag
data1 <- data1[order(data1$Height_z), ]  # sort ascending by z-score
data1$Sex <- factor(data1$Sex, levels = data1$Sex)  # convert to factor to retain sorted order in plot.

# Diverging bar chart: one bar per row of data1, filled by whether the
# standardised height is above or below the mean
ggplot(data1, aes(x = Sex, y = Height_z, label = Height_z)) +
  geom_bar(stat = "identity", aes(fill = Height_type), width = .5) +
  scale_fill_manual(name = "Height",
                    labels = c("Above Average", "Below Average"),
                    values = c("above" = "#00ba38", "below" = "#f8766d")) +
  labs(subtitle = "Normalised Height from 'Survey'",
       title = "Diverging Bars",
       x = "Respondent (row id)") +   # data1$Sex holds row names, not sex
  coord_flip()

Diverging Lollipop Chart
# Diverging lollipop chart: a segment from 0 to each row's standardised
# height, capped by a point that carries the value as a white label
ggplot(data1, aes(x = Sex, y = Height_z, label = Height_z)) +
  geom_point(stat = "identity", fill = "black", size = 6) +
  geom_segment(aes(y = 0,
                   x = Sex,
                   yend = Height_z,
                   xend = Sex),
               color = "black") +
  geom_text(color = "white", size = 2) +
  labs(title = "Diverging Lollipop Chart",
       subtitle = "Normalized height from 'survey data': Lollipop",
       x = "Respondent (row id)") +   # data1$Sex holds row names, not sex
  ylim(-2.5, 2.5) +
  coord_flip()

Diverging Dot Plot
# Diverging dot plot: one labelled point per row of data1, coloured by whether
# the standardised height is above or below the mean
ggplot(data1, aes(x = Sex, y = Height_z, label = Height_z)) +
  geom_point(stat = "identity", aes(col = Height_type), size = 6) +
  scale_color_manual(name = "Height",
                     labels = c("Above Average", "Below Average"),
                     values = c("above" = "#00ba38", "below" = "#f8766d")) +
  geom_text(color = "white", size = 2) +
  labs(title = "Diverging Dot Plot",
       subtitle = "Normalized height from 'survey data': Dotplot",
       x = "Respondent (row id)") +   # data1$Sex holds row names, not sex
  ylim(-2.5, 2.5) +
  coord_flip()

Pie chart
# pie chart
pie <- ggplot(data, aes(x = "", fill = factor(Smoke))) + 
  geom_bar(width = 1) +
  theme(axis.line = element_blank(), 
        plot.title = element_text(hjust=0.5)) + 
  labs(fill="Smoke", 
       x=NULL, 
       y=NULL, 
       title="Pie Chart of Smokers", 
       caption="Source: survey data")

pie + coord_polar(theta = "y", start=0)

Violin Plot
# Violin Plot
g <- ggplot(data, aes(Smoke, Pulse))
g + geom_violin() + 
  labs(title="Violin plot", 
       subtitle="Smokers vs Pulse",
       caption="Source: survey data",
       x="Smokers",
       y="Pulse")

Boxplot
# Box plot of Pulse for each Smoke level; varwidth scales each box's width
# with the number of observations in that group
g <- ggplot(data, aes(Smoke, Pulse))
g + geom_boxplot(varwidth = TRUE, fill = "plum") +
  labs(title = "Box plot",
       subtitle = "Pulse grouped by Smokers",
       caption = "Source: survey data",
       x = "Smokers",
       y = "Pulse")

Boxplot
# Boxplot
g <- ggplot(data, aes(Smoke, Pulse))
g + geom_boxplot(aes(fill=factor(Sex))) + 
  theme(axis.text.x = element_text(angle=65, vjust=0.6)) + 
  labs(title="Box plot", 
       subtitle="Pulse grouped by Smokers",
       caption="Source: survey data",
       x="Smokers",
       y="Pulse")

Dot + Box Plot
#Dot + Box Plot
g <- ggplot(data, aes(Smoke, Pulse))
g + geom_boxplot() + 
  geom_dotplot(binaxis='y', 
               stackdir='center', 
               dotsize = .5, 
               fill="red") +
  theme(axis.text.x = element_text(angle=65, vjust=0.6)) + 
  labs(title="Box plot + Dot plot", 
       subtitle="Pulse vs Smokers: Each dot represents 1 row in source data",
       caption="Source: survey data",
       x="Smokers",
       y="Pulse")
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

Treemap
library(treemapify)
## Warning: package 'treemapify' was built under R version 4.0.2
ggplot(data, aes(area=Pulse, fill=Smoke, subgroup=Smoke)) + 
  geom_treemap()

ggplot(data, aes(area=Pulse, fill=Smoke, subgroup=Smoke)) + 
  geom_treemap()+
  #main group bordering
  geom_treemap_subgroup_border()+
  #subgroup heading in white
  geom_treemap_subgroup_text(color="white")+
  #all other group text in black
  geom_treemap_text(aes(label=Clap), color="black")+
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  scale_fill_brewer(palette = "Dark2")


PART 2: MTCars and Midwest dataset (from class)


Import libraries and dataset
library(ggplot2)
theme_set(theme_bw())  # pre-set the bw theme.
data("midwest")
midwest
## # A tibble: 437 × 28
##      PID county    state  area poptotal popdensity popwhite popblack popamerindian
##    <int> <chr>     <chr> <dbl>    <int>      <dbl>    <int>    <int>         <int>
##  1   561 ADAMS     IL    0.052    66090      1271.    63917     1702            98
##  2   562 ALEXANDER IL    0.014    10626       759      7054     3496            19
##  3   563 BOND      IL    0.022    14991       681.    14477      429            35
##  4   564 BOONE     IL    0.017    30806      1812.    29344      127            46
##  5   565 BROWN     IL    0.018     5836       324.     5264      547            14
##  6   566 BUREAU    IL    0.05     35688       714.    35157       50            65
##  7   567 CALHOUN   IL    0.017     5322       313.     5298        1             8
##  8   568 CARROLL   IL    0.027    16805       622.    16519      111            30
##  9   569 CASS      IL    0.024    13437       560.    13384       16             8
## 10   570 CHAMPAIGN IL    0.058   173025      2983.   146506    16559           331
## # … with 427 more rows, and 19 more variables: popasian <int>, popother <int>,
## #   percwhite <dbl>, percblack <dbl>, percamerindan <dbl>, percasian <dbl>,
## #   percother <dbl>, popadults <int>, perchsd <dbl>, percollege <dbl>,
## #   percprof <dbl>, poppovertyknown <int>, percpovertyknown <dbl>,
## #   percbelowpoverty <dbl>, percchildbelowpovert <dbl>, percadultpoverty <dbl>,
## #   percelderlypoverty <dbl>, inmetro <int>, category <chr>
Scatterplot
# Scatterplot of area vs total population for the midwest counties,
# coloured by state and sized by population density
gg <- ggplot(midwest, aes(x = area, y = poptotal)) +
  geom_point(aes(col = state, size = popdensity)) +
  geom_smooth(method = "loess", se = FALSE) +  # smoothing line, no CI band
  xlim(c(0, 0.1)) +       # rows outside these limits are dropped by ggplot
  ylim(c(0, 500000)) +
  labs(subtitle = "Area Vs Population",
       y = "Population",
       x = "Area",
       title = "Scatterplot",
       caption = "Source: midwest")
plot(gg)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).

Scatterplot + Encircle
library(ggalt)
# Counties to highlight: 350000 < poptotal <= 500000 and 0.01 < area < 0.1
midwest_select <- midwest[midwest$poptotal > 350000 &
                            midwest$poptotal <= 500000 &
                            midwest$area > 0.01 &
                            midwest$area < 0.1, ]
gg1 <- ggplot(midwest, aes(x = area, y = poptotal)) +
  geom_point(aes(col = state, size = popdensity)) +   # draw points
  geom_smooth(method = "loess", se = FALSE) +         # draw smoothing line
  xlim(c(0, 0.1)) +
  ylim(c(0, 500000)) +
  geom_encircle(aes(x = area, y = poptotal),
                data = midwest_select,
                color = "red",
                size = 2,
                expand = 0.08) +   # encircle
  labs(subtitle = "Area Vs Population",
       y = "Population",
       x = "Area",
       title = "Scatterplot + Encircle",
       caption = "Source: midwest")
plot(gg1)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).

Scatterplot with overlapping points
# Scatterplot of popwhite (x) against popblack (y) with a linear fit.
# aes() is positional: first argument is x, second is y, so the axis labels
# below must follow that order.
g <- ggplot(midwest, aes(popwhite, popblack))
g + geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(subtitle = "White  vs Black population",
       x = "popwhite",
       y = "popblack",
       title = "Scatterplot with overlapping points",
       caption = "Source: midwest")
## `geom_smooth()` using formula 'y ~ x'

Jittered Points
data(mpg, package="ggplot2")
g <- ggplot(mpg, aes(cty, hwy))
g + geom_jitter(width = .5, size=1) +
  labs(subtitle="mpg: city vs highway mileage", 
       y="hwy", 
       x="cty", 
       title="Jittered Points")

Scatterplot
# Counts plot: point size encodes how many rows share the same (cty, hwy) pair
theme_set(theme_bw())  # pre-set the bw theme.
g <- ggplot(mpg, aes(cty, hwy))
g + geom_count(col = "tomato3", show.legend = FALSE) +
  labs(subtitle = "mpg: city vs highway mileage",
       y = "hwy",
       x = "cty",
       title = "Counts Plot")

Bubble Plot
# Bubble plot: displ vs cty for four manufacturers, coloured by manufacturer
# and sized by hwy
mpg_select <- mpg[mpg$manufacturer %in% c("audi", "ford", "honda", "hyundai"), ]
g <- ggplot(mpg_select, aes(displ, cty)) +
  labs(subtitle = "mpg: Displacement vs City Mileage",
       title = "Bubble chart")

# One linear fit per manufacturer, without confidence bands
g + geom_jitter(aes(col = manufacturer, size = hwy)) +
  geom_smooth(aes(col = manufacturer), method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

# Marginal Histogram / Boxplot
data(mpg)
qplot(mpg$hwy, geom="histogram") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Default-binned histogram of hwy. Refer to the column by bare name inside
# aes(); `mpg$hwy` bypasses the data argument and triggers a ggplot2 warning.
ggplot(data = mpg, aes(hwy)) +
  geom_histogram()
## Warning: Use of `mpg$hwy` is discouraged. Use `hwy` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of hwy with bins of width 5, restricted to the range 10-50
qplot(mpg$hwy,
      geom = "histogram",
      binwidth = 5,
      main = "Histogram for hwy",
      xlab = "hwy",  # the plotted variable is hwy, not Age
      fill = I("blue"),
      col = I("red"),
      alpha = I(.4),
      xlim = c(10, 50))
## Warning: Removed 2 rows containing missing values (geom_bar).

# Histogram of hwy (bins of width 2 from 10 to 50) with bars coloured by count.
# The column is referenced by bare name inside aes() rather than as `mpg$hwy`.
ggplot(data = mpg, aes(hwy)) +
  geom_histogram(breaks = seq(10, 50, by = 2),
                 col = "red",
                 aes(fill = after_stat(count))) +
  scale_fill_gradient("Count", low = "green", high = "red") +
  labs(title = "Histogram for mpg", x = "hwy", y = "Count")
## Warning: Use of `mpg$hwy` is discouraged. Use `hwy` instead.

# Density-scaled histogram of hwy with a kernel density curve on top.
# The column is referenced by bare name inside aes() rather than as `mpg$hwy`.
ggplot(data = mpg, aes(hwy)) +
  geom_histogram(aes(y = after_stat(density)),
                 breaks = seq(10, 50, by = 2),
                 col = "red",
                 fill = "green",
                 alpha = .2) +
  geom_density(col = 2)
## Warning: Use of `mpg$hwy` is discouraged. Use `hwy` instead.

## Warning: Use of `mpg$hwy` is discouraged. Use `hwy` instead.

Stacked Bar
#Stacked Bar
g <- ggplot(mpg, aes(displ)) + scale_fill_brewer(palette = "Spectral")

g + geom_histogram(aes(fill=class), 
                   binwidth = .1, 
                   col="black", 
                   size=.1) +  # change binwidth
  labs(title="Histogram with Auto Binning", 
       subtitle="Engine Displacement across Vehicle Classes")  

g + geom_histogram(aes(fill=class), 
                   bins=5, 
                   col="black", 
                   size=.1) +   # change number of bins
  labs(title="Histogram with Fixed Bins", 
       subtitle="Engine Displacement across Vehicle Classes") 

g <- ggplot(mpg, aes(manufacturer))
g + geom_bar(aes(fill=class), width = 0.5) + 
  theme(axis.text.x = element_text(angle=65, vjust=0.6)) + 
  labs(title="Histogram on Categorical Variable", 
       subtitle="Manufacturer across Vehicle Classes") 

Density plot
# Density plot
g <- ggplot(mpg, aes(cty))
g + geom_density(aes(fill=factor(cyl)), alpha=0.8) + 
  labs(title="Density plot", 
       subtitle="City Mileage Grouped by Number of cylinders",
       caption="Source: mpg",
       x="City Mileage",
       fill="# Cylinders")

# Box plot of cty for each vehicle class; varwidth scales each box's width
# with the number of observations in that class
g <- ggplot(mpg, aes(class, cty))
g + geom_boxplot(varwidth = TRUE, fill = "plum") +
  labs(title = "Box plot",
       subtitle = "City Mileage grouped by Class of vehicle",
       caption = "Source: mpg",
       x = "Class of Vehicle",
       y = "City Mileage")

Correlation
# correlation plot
library(ggcorrplot)

# Correlation matrix
data(mtcars)
corr <- round(cor(mtcars), 1)
Plot
# Plot
ggcorrplot(corr)

ggcorrplot(corr, method = "circle")
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

ggcorrplot(corr, hc.order = TRUE, outline.col = "white")

# Types of correlogram layout
# --------------------------------
# Get the lower triangle
ggcorrplot(corr, hc.order = TRUE, type = "lower",
           outline.col = "white")

# Get the upper triangle
ggcorrplot(corr, hc.order = TRUE, type = "upper",
           outline.col = "white")

# Change colors and theme
# --------------------------------
# Argument colors
ggcorrplot(corr, hc.order = TRUE, type = "lower",
           outline.col = "white",
           ggtheme = ggplot2::theme_gray,
           colors = c("#6D9EC1", "white", "#E46726"))

ggcorrplot(corr, hc.order = TRUE, 
           type = "lower", 
           lab = TRUE, 
           lab_size = 3, 
           method="circle", 
           colors = c("tomato2", "white", "springgreen3"), 
           title="Correlogram of mtcars", 
           ggtheme=theme_bw)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

Diverging Barcharts
#Diverging bars
data("mtcars")  # load data
mtcars$`car name` <- rownames(mtcars)  # create new column for car names
mtcars$mpg_z <- round((mtcars$mpg - mean(mtcars$mpg))/sd(mtcars$mpg), 2)  # compute normalized mpg
mtcars$mpg_type <- ifelse(mtcars$mpg_z < 0, "below", "above")  # above / below avg flag
mtcars <- mtcars[order(mtcars$mpg_z), ]  # sort
mtcars$`car name` <- factor(mtcars$`car name`, levels = mtcars$`car name`)  # convert to factor to retain sorted order in plot.

# Diverging Barcharts
ggplot(mtcars, aes(x=`car name`, y=mpg_z, label=mpg_z)) + 
  geom_bar(stat='identity', aes(fill=mpg_type), width=.5)  +
  scale_fill_manual(name="Mileage", 
                    labels = c("Above Average", "Below Average"), 
                    values = c("above"="#00ba38", "below"="#f8766d")) + 
  labs(subtitle="Normalised mileage from 'mtcars'", 
       title= "Diverging Bars") + 
  coord_flip()  

Diverging Lollipop Chart
# Diverging Lollipop Chart
ggplot(mtcars, aes(x=`car name`, y=mpg_z, label=mpg_z)) + 
  geom_point(stat='identity', fill="black", size=6)  +
  geom_segment(aes(y = 0, 
                   x = `car name`, 
                   yend = mpg_z, 
                   xend = `car name`), 
               color = "black") +
  geom_text(color="white", size=2) +
  labs(title="Diverging Lollipop Chart", 
       subtitle="Normalized mileage from 'mtcars': Lollipop") + 
  ylim(-2.5, 2.5) +
  coord_flip()

Diverging Dot Plot
#Diverging Dot Plot
ggplot(mtcars, aes(x=`car name`, y=mpg_z, label=mpg_z)) + 
  geom_point(stat='identity', aes(col=mpg_type), size=6)  +
  scale_color_manual(name="Mileage", 
                     labels = c("Above Average", "Below Average"), 
                     values = c("above"="#00ba38", "below"="#f8766d")) + 
  geom_text(color="white", size=2) +
  labs(title="Diverging Dot Plot", 
       subtitle="Normalized mileage from 'mtcars': Dotplot") + 
  ylim(-2.5, 2.5) +
  coord_flip()

Pie chart
# pie chart

pie <- ggplot(mpg, aes(x = "", fill = factor(class))) + 
  geom_bar(width = 1) +
  theme(axis.line = element_blank(), 
        plot.title = element_text(hjust=0.5)) + 
  labs(fill="class", 
       x=NULL, 
       y=NULL, 
       title="Pie Chart of class", 
       caption="Source: mpg")

pie + coord_polar(theta = "y", start=0)

Violin Plot
# Violin Plot
g <- ggplot(mpg, aes(class, cty))
g + geom_violin() + 
  labs(title="Violin plot", 
       subtitle="City Mileage vs Class of vehicle",
       caption="Source: mpg",
       x="Class of Vehicle",
       y="City Mileage")

Boxplot
# Box plot of cty for each vehicle class; varwidth scales each box's width
# with the number of observations in that class
g <- ggplot(mpg, aes(class, cty))
g + geom_boxplot(varwidth = TRUE, fill = "plum") +
  labs(title = "Box plot",
       subtitle = "City Mileage grouped by Class of vehicle",
       caption = "Source: mpg",
       x = "Class of Vehicle",
       y = "City Mileage")

Boxplot
# Boxplot
g <- ggplot(mpg, aes(class, cty))
g + geom_boxplot(aes(fill=factor(cyl))) + 
  theme(axis.text.x = element_text(angle=65, vjust=0.6)) + 
  labs(title="Box plot", 
       subtitle="City Mileage grouped by Class of vehicle",
       caption="Source: mpg",
       x="Class of Vehicle",
       y="City Mileage")

Dot + Box Plot
# Dot + box plot: cty per manufacturer, with the individual observations
# stacked as centred dots over each box
g <- ggplot(mpg, aes(manufacturer, cty))
g + geom_boxplot() +
  geom_dotplot(binaxis = "y",
               stackdir = "center",
               dotsize = .5,
               fill = "red") +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
  # x is mapped to manufacturer, so the labels name that variable
  labs(title = "Box plot + Dot plot",
       subtitle = "City Mileage vs Manufacturer: Each dot represents 1 row in source data",
       caption = "Source: mpg",
       x = "Manufacturer",
       y = "City Mileage")
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

Treemap
proglangs <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/proglanguages.csv")
library(treemapify)

ggplot(proglangs, aes(area=value, fill=parent, subgroup=parent)) + 
  geom_treemap()

ggplot(proglangs, aes(area=value, fill=parent, subgroup=parent)) + 
  geom_treemap()+
  #main group bordering
  geom_treemap_subgroup_border()+
  #subgroup heading in white
  geom_treemap_subgroup_text(color="white")+
  #all other group text in black
  geom_treemap_text(aes(label=id), color="black")+
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  scale_fill_brewer(palette = "Dark2")